***************
* This file prepares the main oste dataset
* Author: Daniel Kopp
***************

import delimited using "data_raw\20220407_SECO2KOF_Tab1_OSTE.csv" , clear

* All variables
local allvars ""
foreach v of varlist * {
	loc allvars "`allvars' `v'"
}
foreach var of varlist `allvars' {
	cap replace `var' = "" if `var'=="\N"
}

* Format of date variables
foreach i in  dat_freigabe_avam dat_abmeldung dat_gueltig_bis   ///
	dat_sperrfrist dat_stellenantritt dat_vertragsdauer {
	    g `i'_tmp = date(`i', "YMD")
		format `i'_tmp %tdCCYY-NN-DD
		drop  `i'
		rename  `i'_tmp `i'
}
	
drop if dat_freigabe_avam<td(01jul2018)		// 249
	
local date_var "dat_freigabe_avam"	
quietly sum `date_var'
local 	`date_var'_max = r(max)
local 	`date_var'_min = r(min)
disp string(``date_var'_min',"%td") 	// 01jul2018
disp string(``date_var'_max',"%td") 	// 30jun2021

* Save code and text  in separate file
rename cod_schrift cod_schriftlich 
rename schriftlich txt_schriftlich
rename muendlich txt_muendlich
rename abmeldegrund cod_oste_abmeldegrund 

foreach v in num_alter_von num_alter_bis cod_sprache cod_schriftlich cod_muendlich {
	destring `v', replace
}

* Generate dummy variables
cap lab def gender_lab 0 "Male" 1 "Female"
foreach i in rav {
	gen 	`i'_gender = 0 if `i'_geschlecht=="M"
	replace `i'_gender = 1 if `i'_geschlecht=="F"
	replace `i'_gender = . if `i'_geschlecht==""
	label values `i'_gender gender_lab
	drop `i'_geschlecht
}

* Generate dummy variables
cap lab def gender_lab 0 "Male" 1 "Female"
gen  	preferred_gender = 0 if gewuenschtes_geschlecht=="M"
replace preferred_gender = 1 if gewuenschtes_geschlecht=="F"
replace preferred_gender = . if gewuenschtes_geschlecht=="-1"
label values preferred_gender gender_lab
drop gewuenschtes_geschlecht

la def yesno 0 "no" 1 "yes", modify
foreach v of varlist meldepflicht extern_publikation extern_anonym intern_publikation intern_anonym  auftraggeber priv_fahrzeug {
	rename `v' tmp
	replace tmp = ustrtrim(tmp)
	g `v' = 1 if (tmp == "Y")
	replace `v' = 0 if (tmp == "N")
	replace `v' = . if (tmp == "")
	la val `v' yesno
	drop tmp	
}

la def yesno 0 "no" 1 "yes", modify
foreach v of varlist ab_sofort nach_vereinbarung vorselektion sonnfeiertagsarbeit schichtarbeit nachtarbeit heimarbeit lehre muttersprache {
	rename `v' tmp
	replace tmp = ustrtrim(tmp)
	g `v' = 1 if (tmp == "Y")
	replace `v' = 0 if (tmp == "N")
	replace `v' = . if (tmp == "-")
	la val `v' yesno
	drop tmp	
}

* Kurzeinsatz is always missing, but we have the variables cod_frist_typ and txt_frist_typ instead.
drop kurzeinsatz

gen 	befristet = 0 if cod_frist_typ=="UNBEFR"
replace befristet = 1 if cod_frist_typ=="KURZ"
replace befristet = 2 if cod_frist_typ=="BEFR"
replace befristet = . if txt_frist_typ=="-1"

label define befristet_lab 0 "Unlimited" 1 "Limited, less than 15 days" 2 "Limited, more than 14 days"
label values befristet befristet_lab

tab befristet, m

* The following variables are all in "oste_occupations" in long format
forvalues i = 1/3 {
	cap drop cod_b`i'_avam_de txt_b`i'_avam_de cod_b`i'_isco5 txt_b`i'_isco5_de cod_b`i'_erfahrung txt_b`i'_erfahrung cod_b`i'_ausbildungsniv txt_b`i'_ausbildungsniv cod_b`i'_qualifikation b`i'_qualifikation    
}

foreach i in  stand_meldepflicht noga_6st unternehmensgroesse arbeitsort fuehreraus_kat sprache schriftlich muendlich      {
	preserve
		distinct cod_`i'
		duplicates drop cod_`i', force
		keep cod_`i' txt_`i'
		save "Help_files\label_`i'.dta", replace
	restore

	drop txt_`i'  
}


* We have several duplicates in terms of oste_id_avam because the same ad may have several language requirements (cod_sprache) 
* However, as we will not be using the language information later, we can drop the duplicates:
 
drop cod_sprache cod_schriftlich cod_muendlich muttersprache
duplicates drop

* Check for more duplicates
duplicates tag oste_id_avam , gen(dup)

* One oste_id_avam has two cod_arbeitsort - we drop one of them
duplicates tag oste_id_avam cod_arbeitsort, gen(dup_arbeitsort)
bys oste_id_avam dup_arbeitsort: gen n_arbeitsort = _n
drop if dup==1 & dup_arbeitsort==0 & n_arbeitsort==2 
drop dup_arbeitsort n_arbeitsort

* One oste_id_avam has two burnr_ag (but one of those is missing - we drop it)
count if burnr_ag==-1 & dup>0
drop  if burnr_ag==-1 & dup>0

* All other remaining duplicates (9) oste_id_avam have two cod_berater - we drop one of them 

bys oste_id_avam dup: gen n_dup = _n
drop if dup==1 & n_dup==2 
drop dup n_dup


* Add information on the "Auftraggeber" and the legal form of the enterprise to the dataset, as well as a more comprehensive list of enterprise names (Table 4)
preserve
	import delimited using "data_raw\20220303_SECO2KOF_Tab4_OSTE.csv" , clear
	* Save cod_rechtsform and txt_rechtsform in separate file
	rename rechtsform_ag txt_rechtsform
	distinct cod_rechtsform		// 27
	duplicates drop cod_rechtsform, force
	keep cod_rechtsform txt_rechtsform
	save "Help_files\label_rechtsform.dta", replace
restore

preserve
	import delimited using "data_raw\20220303_SECO2KOF_Tab4_OSTE.csv" , clear
	* Save Bur-Nr and name of Auftraggeber in separate file
	distinct burnr_auftraggeber		// 24'065
	duplicates drop burnr_auftraggeber	, force
	keep burnr_auftraggeber	 name_auftraggeber	
	save "Help_files\label_bur_nr_auftraggeber.dta", replace
restore


preserve
	import delimited using "data_raw\20220303_SECO2KOF_Tab4_OSTE.csv" , encoding(utf8) clear

	drop rechtsform_ag name_auftraggeber

	rename burnr_ag burnr_ag_new 
	rename name_ag name_ag_new

	tempfile oste_companyinfo
	save `oste_companyinfo'
restore

merge 1:1 oste_id_avam using  `oste_companyinfo'
drop if _merge==2
drop _merge

replace burnr_ag = burnr_ag_new if burnr_ag==-1
drop burnr_ag_new

replace name_ag = name_ag_new if name_ag=="-1"
drop name_ag_new

* We don't need the company name
drop name_ag

distinct oste_id_avam		// 563'444

save "data_processed\oste.dta", replace

